[csapp] Lab4 Cache Lab

1. Writing a cache simulator

This simulator is quite simple: all it has to do is faithfully implement the LRU replacement policy.

/* @author: chenghua.Wang
 * @time: 2022/02/17
 * @brief: An LRU (least recently used) cache simulator.
 *         A file of the CSAPP cache lab, Part A.
 * @note: compiled on ubuntu20.04 x86-64, gcc 9.3.0.
 */
#include "cachelab.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <limits.h>
#include <inttypes.h>

#define True 1
#define False 0
#define _string_buf_ 1024

typedef int32_t bool;

// data structs
typedef struct {
    bool h, v;              // -h / -v flags
    int32_t s, E, b;        // set index bits, associativity, block bits
    char t[_string_buf_];   // trace file path
} _argument_unit_;

typedef struct {
    int64_t hit_cnt, miss_cnt, eviction_cnt;
} _cnt_unit_;

typedef struct {
    int32_t valid;          // valid bit
    int32_t tag;
    int32_t stamp;          // age counter, used to find the least recently used line
} _cache_line_;

typedef struct {
    _cache_line_ *cache_lines;
} _cache_set_, *_cache_set_p_;

// global data init.
_argument_unit_ arg_unit;
_cnt_unit_ cnt_unit;

// argument processing function set.
void help_arg();
bool check_arg();

// cache update and init function set.
_cache_set_p_ init_cache();
void cache_update(uint32_t address, _cache_set_p_ cache_sets);
void cache_update_stamp(_cache_set_p_ cache_sets);
void simulate_cache();

int main(int argc, char *argv[]) {
    arg_unit.h = arg_unit.v = False;
    arg_unit.b = arg_unit.E = arg_unit.s = 0;
    cnt_unit.eviction_cnt = cnt_unit.hit_cnt = cnt_unit.miss_cnt = 0;
    int32_t opt;
    while (-1 != (opt = getopt(argc, argv, "hvs:E:b:t:"))) {
        switch (opt) {
        case 'h':
            help_arg();
            arg_unit.h = True;
            break;
        case 'v':
            arg_unit.v = True;
            help_arg();
            break;
        case 's':
            arg_unit.s = atoi(optarg);
            break;
        case 'E':
            arg_unit.E = atoi(optarg);
            break;
        case 'b':
            arg_unit.b = atoi(optarg);
            break;
        case 't':
            strcpy(arg_unit.t, optarg);
            break;
        default:
            help_arg();
            break;
        }
    }
    if (check_arg() == False) {
        help_arg();
        return 1;
    }
    simulate_cache();
    printSummary(cnt_unit.hit_cnt, cnt_unit.miss_cnt, cnt_unit.eviction_cnt);
    return 0;
}

// argument processing function set.
void help_arg() {
    printf("Usage: ./csim-ref [-hv] -s <s> -E <E> -b <b> -t <tracefile>\n");
    printf("-h: Optional help flag that prints usage info\n");
    printf("-v: Optional verbose flag that displays trace info\n");
    printf("-s <s>: Number of set index bits\n");
    printf("-E <E>: Associativity (number of lines per set)\n");
    printf("-b <b>: Number of block bits\n");
    printf("-t <tracefile>: Name of the valgrind trace to replay\n");
    return;
}

bool check_arg() {
    if (arg_unit.s <= 0 || arg_unit.E <= 0 || arg_unit.b <= 0 || arg_unit.t[0] == '\0') return False;
    else return True;
}

// cache update and init function set.
_cache_set_p_ init_cache() {
    /* get memory for the sets and cache lines
     * s is the number of set index bits, so S = 2^s is the number of sets.
     * E is the associativity (number of lines per set)
     * b is the number of block bits
     */
    int32_t tmp_S = 1 << arg_unit.s;
    _cache_set_p_ _ret_cache_sets = (_cache_set_p_)malloc(sizeof(_cache_set_) * tmp_S);
    for (int32_t i = 0; i < tmp_S; ++i) {
        _ret_cache_sets[i].cache_lines = (_cache_line_ *)malloc(sizeof(_cache_line_) * arg_unit.E);
        for (int32_t j = 0; j < arg_unit.E; ++j) {
            _ret_cache_sets[i].cache_lines[j].valid = 0;
            _ret_cache_sets[i].cache_lines[j].tag = -1;
            _ret_cache_sets[i].cache_lines[j].stamp = -1;
        }
    }
    return _ret_cache_sets;
}

void cache_update(uint32_t address, _cache_set_p_ cache_sets) {
    // decompose the address: the low b bits are the block offset,
    // the next s bits are the set index, the remaining bits are the tag.
    int32_t set_index_address = (address >> arg_unit.b) & ((1U << arg_unit.s) - 1);
    int32_t tag_address = address >> (arg_unit.b + arg_unit.s);
    // hit.
    for (int32_t i = 0; i < arg_unit.E; ++i) {
        if (cache_sets[set_index_address].cache_lines[i].valid &&
            cache_sets[set_index_address].cache_lines[i].tag == tag_address) {
            cache_sets[set_index_address].cache_lines[i].stamp = 0;
            cnt_unit.hit_cnt++;
            return;
        }
    }
    // miss, but there is an empty line.
    for (int32_t i = 0; i < arg_unit.E; ++i) {
        if (cache_sets[set_index_address].cache_lines[i].valid == 0) {
            cache_sets[set_index_address].cache_lines[i].valid = 1;
            cache_sets[set_index_address].cache_lines[i].tag = tag_address;
            cache_sets[set_index_address].cache_lines[i].stamp = 0;
            cnt_unit.miss_cnt++;
            return;
        }
    }
    // miss, and a line must be evicted: pick the one with the largest stamp (LRU).
    cnt_unit.miss_cnt++;
    cnt_unit.eviction_cnt++;
    int32_t max_stamp = INT_MIN;
    int32_t max_stamp_idx = -1;
    for (int32_t i = 0; i < arg_unit.E; ++i) {
        if (cache_sets[set_index_address].cache_lines[i].stamp > max_stamp) {
            max_stamp = cache_sets[set_index_address].cache_lines[i].stamp;
            max_stamp_idx = i;
        }
    }
    cache_sets[set_index_address].cache_lines[max_stamp_idx].tag = tag_address;
    cache_sets[set_index_address].cache_lines[max_stamp_idx].stamp = 0;
    return;
}

void cache_update_stamp(_cache_set_p_ cache_sets) {
    // age every line by one after each memory access
    int32_t tmp_S = 1 << arg_unit.s;
    for (int32_t i = 0; i < tmp_S; ++i) {
        for (int32_t j = 0; j < arg_unit.E; ++j) {
            cache_sets[i].cache_lines[j].stamp++;
        }
    }
    return;
}

void simulate_cache() {
    /* Start simulating.
     * Read data from the trace file (t).
     * Each line has the format <op address,size>, e.g.
     *   "I 0400d7d4,8"
     *   " M 0421c7f0,4"
     *   " L 04f6b868,8"
     *   " S 7ff0005c8,8"
     * 'I' (instruction fetch) lines are ignored.
     */
    int32_t tmp_S = 1 << arg_unit.s;
    FILE *fp = fopen(arg_unit.t, "r");
    if (fp == NULL) {
        printf("open error");
        exit(-1);
    }
    _cache_set_p_ cache_sets = init_cache();
    char op;
    uint32_t address;
    int32_t size;
    while (fscanf(fp, " %c %x,%d", &op, &address, &size) > 0) {
        switch (op) {
        case 'L':
            cache_update(address, cache_sets);
            break;
        case 'M':
            // a modify is a load followed by a store: fall through after the load
            cache_update(address, cache_sets);
        case 'S':
            cache_update(address, cache_sets);
            break;
        }
        cache_update_stamp(cache_sets);
    }
    fclose(fp);
    for (int32_t i = 0; i < tmp_S; ++i) {
        free(cache_sets[i].cache_lines);
    }
    free(cache_sets);
}

2. Optimizing matrix transpose

First, the test setup: the cache has 32 sets, each set has only one cache line, and each cache line holds 32 bytes (s = 5, E = 1, b = 5, i.e. 1 KB in total). The transpose function may use at most 12 local variables.
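
Concretely, with b = 5 block-offset bits and s = 5 set-index bits, an address maps to a set as

$$
\text{set}(\textit{addr}) \;=\; \left\lfloor \frac{\textit{addr}}{2^{b}} \right\rfloor \bmod 2^{s} \;=\; \left\lfloor \frac{\textit{addr}}{32} \right\rfloor \bmod 32 .
$$

All of the blocking arguments below are based on this mapping.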

2.1 32 x 32

Start with the most naive approach:

if (M == 32) {
    int tmp;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            tmp = A[i][j];
            B[j][i] = tmp;
        }
    }
}

Unsurprisingly, the result is poor.

Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:870, misses:1183, evictions:1151

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:870, misses:1183, evictions:1151

Summary for official submission (func 0): correctness=1 misses=1183

TEST_TRANS_RESULTS=1:1183

The miss count reaches a staggering 1183.

8x8 blocking

For a 32x32 matrix, 8x8 blocking is the natural choice. A cache line holds 32 bytes, i.e. 8 ints; one row of the 32x32 matrix has 32 ints and therefore occupies 4 cache lines, so the whole cache (32 lines) can hold exactly 8 rows of the matrix. Hence the 8x8 blocking.
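
Spelled out:

$$
\underbrace{32\ \text{B}}_{\text{one cache line}} = 8\ \text{ints},\qquad
\underbrace{32\ \text{ints}}_{\text{one matrix row}} = 4\ \text{cache lines},\qquad
\underbrace{32\ \text{cache lines}}_{\text{whole cache}} = 8\ \text{matrix rows}.
$$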

if (M == 32) {
    for (int i = 0; i < N; i += 8)
        for (int j = 0; j < M; j += 8)
            for (int m = i; m < i + 8; ++m)
                for (int n = j; n < j + 8; ++n)
                    B[n][m] = A[m][n];
}

Processing the matrix in 8x8 blocks lets the cache be used much more effectively.

The result:

Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:1710, misses:343, evictions:311

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:870, misses:1183, evictions:1151

Summary for official submission (func 0): correctness=1 misses=343

TEST_TRANS_RESULTS=1:343

How close to optimal can 8x8 blocking get?

To see what the best achievable performance is, let's walk through the cache behaviour once more. For both A and B, the cache sets of the first 8x8 block are assigned as follows.

$$
\begin{array}{cccccccc}
0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\
4 & 4 & 4 & 4 & 4 & 4 & 4 & 4 \\
8 & 8 & 8 & 8 & 8 & 8 & 8 & 8 \\
12 & 12 & 12 & 12 & 12 & 12 & 12 & 12 \\
16 & 16 & 16 & 16 & 16 & 16 & 16 & 16 \\
20 & 20 & 20 & 20 & 20 & 20 & 20 & 20 \\
24 & 24 & 24 & 24 & 24 & 24 & 24 & 24 \\
28 & 28 & 28 & 28 & 28 & 28 & 28 & 28
\end{array}
$$

This 8x8 matrix shows the set index of every element in the first block (A and B have the same layout, so the same picture holds for both). Using this small block as an example, we can explain why the naive method and the 8x8 method above still incur a large number of cache misses.
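
As a quick check of the table (assuming, as the analysis implicitly does, that A starts at a 32-byte-aligned address), element A[i][j] lives at byte offset 4(32i + j) from the base of A, so

$$
\text{set}(A[i][j]) \;=\; \left\lfloor \frac{4(32i+j)}{32} \right\rfloor \bmod 32 \;=\; \left(4i + \lfloor j/8 \rfloor\right) \bmod 32 .
$$

For the first block (i, j < 8) this is 4i, exactly the table above. B has the same dimensions, so B[i][j] maps to the same set as A[i][j]; that is the root cause of the A/B conflicts discussed next.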

With the naive method, we first read A[0][0], a cold miss; then we write B[0][0], which (since A[0][0] and B[0][0] map to the same set) evicts the line we just loaded, a conflict miss; the next read of A[0][1] is again a conflict miss; the write to B[1][0] is a cold miss; and so on. A large fraction of the accesses in the A-to-B transpose end up missing.

Now look at the 8x8-blocked version. The point of 8x8 blocking is precisely to avoid this massive interference between A and B. Within an 8x8 block, except for the diagonal blocks, the element read from A and the element written to B live in different sets, which removes most of the conflict misses. In each block we read A row by row and write B column by column, so in each 8x8 block of A only the first column (the first element of each row) produces cold misses, and likewise in each 8x8 block of B only the first column misses; on top of that, the diagonal blocks of B suffer extra conflict misses. We can therefore estimate the totals: about 16 x 8 = 128 misses for A and 16 x 8 + 28 (diagonal) = 156 misses for B. If the conflict misses on the diagonal could be eliminated, we would be left with roughly 256 misses in total, which is the best we can hope for.

Since we are allowed up to 12 local variables, we can use 8 of them to buffer a whole row of A before writing it into B, which avoids the conflicts on the diagonal blocks.

if (M == 32) {
    int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
    for (int i = 0; i < 32; i += 8)
        for (int j = 0; j < 32; j += 8)
            for (int k = i; k < (i + 8); ++k) {
                tmp1 = A[k][j];
                tmp2 = A[k][j+1];
                tmp3 = A[k][j+2];
                tmp4 = A[k][j+3];
                tmp5 = A[k][j+4];
                tmp6 = A[k][j+5];
                tmp7 = A[k][j+6];
                tmp8 = A[k][j+7];
                B[j][k] = tmp1;
                B[j+1][k] = tmp2;
                B[j+2][k] = tmp3;
                B[j+3][k] = tmp4;
                B[j+4][k] = tmp5;
                B[j+5][k] = tmp6;
                B[j+6][k] = tmp7;
                B[j+7][k] = tmp8;
            }
}

The result:

Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:1766, misses:287, evictions:255

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:870, misses:1183, evictions:1151

Summary for official submission (func 0): correctness=1 misses=287

TEST_TRANS_RESULTS=1:287

2.2 64 x 64

For a 64x64 matrix each row has 64 ints, so the cache can only hold 4 rows of the matrix. Plain 8x8 blocking is therefore problematic: rows k and k+4 of a block map to the same sets, so writing B is guaranteed to cause conflicts. Let's first try 4x4 blocking and see what happens (a sketch follows).
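
The 4x4 version is just the earlier blocked loop with a block size of 4; a minimal sketch is shown below (illustrative only, the exact code that produced the numbers underneath may differ slightly).

if (M == 64) {
    // straightforward 4x4 blocking, analogous to the 8x8 version used for 32x32
    for (int i = 0; i < N; i += 4)
        for (int j = 0; j < M; j += 4)
            for (int m = i; m < i + 4; ++m)
                for (int n = j; n < j + 4; ++n)
                    B[n][m] = A[m][n];
}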

Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:6498, misses:1699, evictions:1667

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:3474, misses:4723, evictions:4691

Summary for official submission (func 0): correctness=1 misses=1699

TEST_TRANS_RESULTS=1:1699

The result is still not very satisfactory.

Optimization

We can instead block in 8x8 first and then process each 8x8 block as four 4x4 quadrants, still reasoning in terms of set conflicts. The idea: while reading the top four rows of A, transpose the top-left quadrant into its final place in B and temporarily park the transposed top-right quadrant in B's top-right (those 8 ints sit in the same cache lines as B's top-left rows, so parking costs no extra misses); then, column by column, move the parked data down to B's bottom-left while filling B's top-right from A's bottom-left; finally, transpose A's bottom-right quadrant directly.

if (M == 64)
{
    int i, j, x, y;
    int x1, x2, x3, x4, x5, x6, x7, x8;
    for (i = 0; i < N; i += 8)
        for (j = 0; j < M; j += 8) {
            // phase 1: transpose A's top-left quadrant into B's top-left, and park
            // A's (transposed) top-right quadrant in B's top-right for now.
            for (x = i; x < i + 4; ++x) {
                x1 = A[x][j];   x2 = A[x][j+1]; x3 = A[x][j+2]; x4 = A[x][j+3];
                x5 = A[x][j+4]; x6 = A[x][j+5]; x7 = A[x][j+6]; x8 = A[x][j+7];

                B[j][x] = x1;   B[j+1][x] = x2;   B[j+2][x] = x3;   B[j+3][x] = x4;
                B[j][x+4] = x5; B[j+1][x+4] = x6; B[j+2][x+4] = x7; B[j+3][x+4] = x8;
            }
            // phase 2: row by row, fill B's top-right from A's bottom-left column,
            // and move the parked data down into B's bottom-left.
            for (y = j; y < j + 4; ++y) {
                x1 = A[i+4][y]; x2 = A[i+5][y]; x3 = A[i+6][y]; x4 = A[i+7][y];
                x5 = B[y][i+4]; x6 = B[y][i+5]; x7 = B[y][i+6]; x8 = B[y][i+7];

                B[y][i+4] = x1; B[y][i+5] = x2;   B[y][i+6] = x3;   B[y][i+7] = x4;
                B[y+4][i] = x5; B[y+4][i+1] = x6; B[y+4][i+2] = x7; B[y+4][i+3] = x8;
            }
            // phase 3: transpose A's bottom-right quadrant directly.
            for (x = i + 4; x < i + 8; ++x) {
                x1 = A[x][j+4]; x2 = A[x][j+5]; x3 = A[x][j+6]; x4 = A[x][j+7];
                B[j+4][x] = x1; B[j+5][x] = x2; B[j+6][x] = x3; B[j+7][x] = x4;
            }
        }
}

The final result:

Function 0 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 0 (Transpose submission): hits:9066, misses:1179, evictions:1147

Function 1 (2 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (Simple row-wise scan transpose): hits:3474, misses:4723, evictions:4691

Summary for official submission (func 0): correctness=1 misses=1179

TEST_TRANS_RESULTS=1:1179

2.3 61 x 67

For this irregular size we simply experiment with block sizes.

The simplest approach is again 8x8 blocking, done in the same way as above (with bounds checks for the ragged edges); the repeated conflict misses between A and B are still present, but the miss count is good enough here. A sketch is given below.
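
A minimal sketch of this 8x8 blocking, with bounds checks because 61 and 67 are not multiples of 8 (illustrative only; the exact code behind the numbers below may differ):

if (M == 61) {
    // 8x8 blocking with bounds checks; A is N x M (67 x 61), B is M x N (61 x 67)
    for (int i = 0; i < N; i += 8)
        for (int j = 0; j < M; j += 8)
            for (int m = i; m < i + 8 && m < N; ++m)
                for (int n = j; n < j + 8 && n < M; ++n)
                    B[n][m] = A[m][n];
}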

The final result (the full driver output, covering both Part A and Part B):

Part A: Testing cache simulator
Running ./test-csim
                         Your simulator      Reference simulator
Points (s,E,b)     Hits  Misses  Evicts    Hits  Misses  Evicts
     3 (1,1,1)        9       8       6       9       8       6  traces/yi2.trace
     3 (4,2,4)        4       5       2       4       5       2  traces/yi.trace
     3 (2,1,4)        2       3       1       2       3       1  traces/dave.trace
     3 (2,1,3)      167      71      67     167      71      67  traces/trans.trace
     3 (2,2,3)      201      37      29     201      37      29  traces/trans.trace
     3 (2,4,3)      212      26      10     212      26      10  traces/trans.trace
     3 (5,1,5)      231       7       0     231       7       0  traces/trans.trace
     6 (5,1,5)   265189   21775   21743  265189   21775   21743  traces/long.trace
    27

Part B: Testing transpose function
Running ./test-trans -M 32 -N 32
Running ./test-trans -M 64 -N 64
Running ./test-trans -M 61 -N 67

Cache Lab summary:
                        Points   Max pts   Misses
Csim correctness          27.0        27
Trans perf 32x32           8.0         8      287
Trans perf 64x64           8.0         8     1179
Trans perf 61x67          10.0        10     1905
          Total points    53.0        53
Author: WangCH · Published 2022-03-05 · Updated 2022-03-05
